InterCD 2001 May

home *** CD-ROM | disk | FTP | other *** search

/ InterCD 2001 May / may_2001.iso / intercd / root / Multimedia / ^DivX_Article / virtualdub / VirtualDub-source-1_4d / mpeg_idct.cpp < prev next >

Wrap

C/C++ Source or Header | 2001-03-20 | 36.6 KB | 1,682 lines

//	VirtualDub - Video processing and capture application
//	Copyright (C) 1998-2001 Avery Lee
//
//	This program is free software; you can redistribute it and/or modify
//	it under the terms of the GNU General Public License as published by
//	the Free Software Foundation; either version 2 of the License, or
//	(at your option) any later version.
//
//	This program is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
//	You should have received a copy of the GNU General Public License
//	along with this program; if not, write to the Free Software
//	Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

///////////////////////////////////////////////////////////////////////////
//
//								WARNING
//
//	This code is heavily based off of the Java MPEG video player written by
//	Joerg Anders.  Because his code was released under the GNU GPL v2, this
//	means VirtualDub must also be released under GNU GPL v2 when MPEG
//	support is included.
//
//	(Like that's any different.)
//
//	This code is really nasty...
// /////////////////////////////////////////////////////////////////////////// #include <math.h> #include "mpeg_idct.h" #include "cpuaccel.h" #pragma warning(disable: 4799) // function has no EMMS instruction extern "C" unsigned char YUV_clip_table[]; /////////////////////////////////////////////////////////////////////////// // enable this define for CPU profiling //#define PROFILE #define CYCLES_PER_SECOND (200000000) /////////////////////////////////////////////////////////////////////////// //#define FAST_IMUL #ifdef PROFILE static long profile_start; static long profile_last; static long profile_fastput_cycles; static long profile_fastadd_cycles; static long profile_slow_cycles; static long profile_fastputs; static long profile_fastadds; static long profile_slows; #include <stdio.h> #include <windows.h> extern "C" void ODS(const char *s) { OutputDebugString(s); } void profile_update() { char buf[256]; long pcnt = (profile_fastput_cycles + profile_fastadd_cycles + profile_slow_cycles)/(CYCLES_PER_SECOND/1000); sprintf(buf, "%ld fast puts (%d/idct), %ld fast adds (%d/idct), %ld slows (%d/idct), total %3d.%c%% of CPU\n" ,profile_fastputs ,profile_fastputs ? profile_fastput_cycles/profile_fastputs:0 ,profile_fastadds ,profile_fastadds ? profile_fastadd_cycles/profile_fastadds:0 ,profile_slows ,profile_slows ? profile_slow_cycles/profile_slows:0 ,pcnt/10,pcnt%10 + '0' ); OutputDebugString(buf); profile_last += CYCLES_PER_SECOND; profile_fastputs = profile_fastadds = profile_slows = 0; profile_fastput_cycles = profile_fastadd_cycles = profile_slow_cycles = 0; } #endif #ifdef BE_REALLY_SLOW #define C6 2*sin(pi/8) #define C4C6 2*sqrt(2)*sin(pi/8) #define C4 sqrt(2) #define Q 2*(cos(pi/8)-sin(pi/8)) #define C4Q 2*sqrt(2)*(cos(pi/8)-sin(pi/8)) #define R 2*(cos(pi/8)+sin(pi/8)) #define C4R 2*sqrt(2)*(cos(pi/8)+sin(pi/8)) #else // For some reason, adding +1 to C6 or -1 to R (but not both) is enough // to bring this IDCT into IEEE-1180 compliance. 
C6+1 is the better // adjustment. #define C6 ((int)(0.7653668647094 * 2048.0 + 0.5))+1 #define C4C6 ((int)(1.082392200292 * 2048.0 + 0.5)) #define C4 ((int)(1.414213562373 * 2048.0 + 0.5)) #define Q ((int)(1.08239220029 * 2048.0 + 0.5)) #define C4Q ((int)(1.53073372946 * 2048.0 + 0.5)) #define R ((int)(2.613125929753 * 2048.0 + 0.5)) #define C4R ((int)(3.69551813004 * 2048.0 + 0.5)) // C6 = 1567 // C4C6 = 2217 // C4 = 2896 // Q = 2217 // C4Q = 3135 // R = 5352 // C4R = 7568 #endif #define MATH_PI (3.141592653589793238462643) static struct { int data[64][64]; } idct_precomputed; int dct_coeff[64]; static int matr1[64], matr2[64]; extern void MJPEG_IDCT(int *); void IDCT_init() { int x, y; for (int i=0; i<64; i++) { double d = 0.25 * 256.0; x = i % 8; y = i / 8; if (!x && !y) d = 0.125 * 256.0; else if (!x || !y) d = 0.125 * sqrt(2) * 256.0; for (int k = 0; k < 8; k++) { for (int l = 0; l < 8; l++) { idct_precomputed.data[i][k * 8 + l] = (int)floor(0.5 + d * cos(MATH_PI * y * (2*k+1) / 16.0) * cos(MATH_PI * x * (2*l+1) / 16.0)); } } #if 0 unsigned char out[64]; for(k=0; k<64; k++) dct_coeff[k] = 0; dct_coeff[i] = 4; IDCT_norm(dct_coeff); dct_coeff[0] += 262144 + 1024; IDCT(out, 8, true); if (i) for(k=0; k<64; k++) if (abs(((int)out[k] - 128) - idct_precomputed.data[i][k]/4)>1) __asm int 3 #endif } } void __declspec(naked) IDCT_fast_put(int pos, void *dst, long pitch) { __asm { push esi push edi push ebp push ebx #ifdef PROFILE rdtsc mov profile_start, eax #endif mov esi,[esp+4+16] ;eax = coefficient # xor ecx,ecx mov eax,[dct_coeff + esi*4] ;ecx = coefficient mov [dct_coeff + esi*4],ecx mov edi,[esp+8+16] ;edi = dest or esi,esi jnz AC_coeff ;we're doing the DC coefficient... easy! 
sar eax,11 mov ecx,[esp+12+16] ;esi = pitch jns DC_above_zero xor eax,eax DC_above_zero: cmp eax,256 jb DC_below_255 mov eax,255 DC_below_255: test MMX_enabled,1 jnz DC_with_MMX mov ebx,eax mov edx,4 shl eax,8 or ebx,eax mov eax,ebx shl eax,16 or eax,ebx DC_loop: mov [edi+0],eax mov [edi+4],eax mov [edi+ecx+0],eax mov [edi+ecx+4],eax lea edi,[edi+ecx*2] dec edx jne DC_loop jmp short fnexit DC_with_MMX: movd mm6,eax punpcklbw mm6,mm6 punpcklwd mm6,mm6 mov edx,4 punpckldq mm6,mm6 DC_loop_MMX: movq [edi],mm6 movq [edi+ecx],mm6 lea edi,[edi+ecx*2] dec edx jne DC_loop_MMX jmp short fnexit ;AC coefficient... we have to scale the table. Damn. ; ; eax = AC coefficient ; ecx = DC coefficient ; edx = V counter ; esi = table pointer ; ebp = H counter AC_coeff: shl esi,8 mov ecx,dct_coeff add esi,offset idct_precomputed mov dword ptr [dct_coeff],0 shl ecx,8 mov edx,8 test MMX_enabled,1 jnz AC_with_MMX AC_loop_vert: mov ebp,8 AC_loop_horiz: mov ebx,eax imul ebx,[esi] add ebx,ecx sar ebx,19 mov bl,[YUV_clip_table+ebx+256] mov [edi],bl add esi,4 inc edi dec ebp jne AC_loop_horiz mov ebx,[esp+12+16] dec edx lea edi,[edi+ebx-8] jne AC_loop_vert fnexit: #ifdef PROFILE rdtsc mov ebx,eax sub eax,profile_start inc profile_fastputs add profile_fastput_cycles,eax sub ebx,profile_last cmp ebx,CYCLES_PER_SECOND jb notsec call profile_update notsec: #endif pop ebx pop ebp pop edi pop esi ret AC_with_MMX: movd mm6,ecx pxor mm7,mm7 mov ebx,eax and eax,7fff8000h shr eax,15 mov ebp,ebx sar ebp,31 sub eax,ebp movd mm4,eax punpckldq mm6,mm6 mov eax,ebx and ebx,80000000h shr ebx,16 and eax,00007fffh or eax,ebx punpckldq mm4,mm4 movd mm5,eax punpckldq mm5,mm5 AC_loop_vert_MMX: movq mm0,[esi] ;AC pattern #0, #1 movq mm2,[esi+8] ;AC pattern #2, #3 movq mm1,mm0 movq mm3,mm2 pmaddwd mm0,mm4 pmaddwd mm2,mm4 add esi,32 pmaddwd mm1,mm5 pmaddwd mm3,mm5 pslld mm0,15 pslld mm2,15 paddd mm1,mm0 paddd mm3,mm2 paddd mm1,mm6 paddd mm3,mm6 psrad mm1,19 psrad mm3,19 packssdw mm1,mm3 movq mm0,[esi-16] ;AC pattern 
#4, #5 movq mm2,[esi-8] ;AC pattern #6, #7 movq mm7,mm0 movq mm3,mm2 pmaddwd mm0,mm4 pmaddwd mm2,mm4 pmaddwd mm7,mm5 pmaddwd mm3,mm5 pslld mm0,15 pslld mm2,15 paddd mm0,mm7 paddd mm2,mm3 paddd mm0,mm6 paddd mm2,mm6 psrad mm0,19 psrad mm2,19 packssdw mm0,mm2 packuswb mm1,mm0 mov ebx,[esp+12+16] dec edx movq [edi],mm1 lea edi,[edi+ebx] jne AC_loop_vert_MMX jmp short fnexit } } void __declspec(naked) IDCT_fast_add(int pos, void *dst, long pitch) { __asm { push esi push edi push ebp push ebx #ifdef PROFILE rdtsc mov profile_start, eax #endif mov esi,[esp+4+16] ;eax = coefficient # xor ecx,ecx mov eax,[dct_coeff + esi*4] ;ecx = coefficient mov edi,[esp+8+16] ;edi = dest mov [dct_coeff + esi*4],ecx ;clear coefficient :) or esi,esi jnz AC_coeff ;we're doing the DC coefficient... easy! sar eax,11 mov ebp,[esp+12+16] test MMX_enabled,1 jnz DC_with_MMX add eax,offset YUV_clip_table+256 mov esi,8 push ebp mov ebp,eax xor eax,eax xor ebx,ebx xor ecx,ecx xor edx,edx DC_loop: mov al,[edi+0] mov bl,[edi+4] mov cl,[edi+1] mov dl,[edi+5] mov al,[ebp+eax] mov bl,[ebp+ebx] mov cl,[ebp+ecx] mov dl,[ebp+edx] mov [edi+0],al mov [edi+4],bl mov [edi+1],cl mov [edi+5],dl mov al,[edi+2] mov bl,[edi+6] mov cl,[edi+3] mov dl,[edi+7] mov al,[ebp+eax] mov bl,[ebp+ebx] mov cl,[ebp+ecx] mov dl,[ebp+edx] mov [edi+2],al mov [edi+6],bl mov [edi+3],cl mov [edi+7],dl add edi,[esp] dec esi jne DC_loop pop ebp jmp short fnexit DC_with_MMX: movd mm6,eax pxor mm7,mm7 punpcklbw mm6,mm6 punpcklwd mm6,mm6 mov edx,4 punpckldq mm6,mm6 psubb mm7,mm6 or eax,eax js DC_loop_MMX_sub DC_loop_MMX_add: movq mm0,[edi] movq mm1,[edi+ebp] paddusb mm0,mm6 paddusb mm1,mm6 movq [edi],mm0 movq [edi+ebp],mm1 lea edi,[edi+ebp*2] dec edx jne DC_loop_MMX_add jmp short fnexit DC_loop_MMX_sub: movq mm0,[edi] movq mm1,[edi+ebp] psubusb mm0,mm7 psubusb mm1,mm7 movq [edi],mm0 movq [edi+ebp],mm1 lea edi,[edi+ebp*2] dec edx jne DC_loop_MMX_sub jmp short fnexit ;AC coefficient... we have to scale the table. Damn. 
; ; eax = AC coefficient ; ecx = DC coefficient ; edx = V counter ; esi = table pointer ; ebp = H counter AC_coeff: shl esi,8 mov ecx,dct_coeff add esi,offset idct_precomputed mov dword ptr [dct_coeff],0 shl ecx,8 mov edx,8 test MMX_enabled,1 jnz AC_with_MMX push eax AC_loop_vert: mov ebp,8 AC_loop_horiz: mov ebx,[esp] xor eax,eax imul ebx,[esi] add ebx,ecx sar ebx,19 mov al,[edi] mov bl,[YUV_clip_table+ebx+eax+256] mov [edi],bl add esi,4 inc edi dec ebp jne AC_loop_horiz mov ebx,[esp+12+20] dec edx lea edi,[edi+ebx-8] jne AC_loop_vert pop eax fnexit: #ifdef PROFILE rdtsc mov ebx,eax sub eax,profile_start inc profile_fastadds add profile_fastadd_cycles,eax sub ebx,profile_last cmp ebx,CYCLES_PER_SECOND jb notsec call profile_update notsec: #endif pop ebx pop ebp pop edi pop esi ret AC_with_MMX: movd mm6,ecx pxor mm7,mm7 mov ebx,eax and eax,7fff8000h shr eax,15 mov ebp,ebx sar ebp,31 sub eax,ebp movd mm4,eax punpckldq mm6,mm6 mov eax,ebx and ebx,80000000h shr ebx,16 and eax,00007fffh or eax,ebx punpckldq mm4,mm4 movd mm5,eax punpckldq mm5,mm5 AC_loop_vert_MMX: movq mm0,[esi] ;AC pattern #0, #1 movq mm2,[esi+8] ;AC pattern #2, #3 movq mm1,mm0 movq mm3,mm2 pmaddwd mm0,mm4 pmaddwd mm2,mm4 add esi,32 pmaddwd mm1,mm5 pmaddwd mm3,mm5 pslld mm0,15 pslld mm2,15 paddd mm0,mm1 paddd mm2,mm3 pxor mm1,mm1 movd mm3,[edi] paddd mm0,mm6 punpcklbw mm3,mm1 paddd mm2,mm6 movq mm7,[esi-16] ;AC pattern #0, #1 psrad mm0,19 psrad mm2,19 packssdw mm0,mm2 movq mm2,[esi-8] ;AC pattern #2, #3 paddw mm0,mm3 movq mm1,mm7 movq mm3,mm2 pmaddwd mm7,mm4 pmaddwd mm2,mm4 pmaddwd mm1,mm5 pmaddwd mm3,mm5 pslld mm7,15 pslld mm2,15 paddd mm7,mm1 paddd mm2,mm3 pxor mm1,mm1 movd mm3,[edi+4] paddd mm7,mm6 punpcklbw mm3,mm1 paddd mm2,mm6 psrad mm7,19 psrad mm2,19 packssdw mm7,mm2 paddw mm7,mm3 packuswb mm0,mm7 movq [edi],mm0 mov ebx,[esp+12+16] dec edx lea edi,[edi+ebx] jne AC_loop_vert_MMX jmp short fnexit } } #define coeff dct_coeff void IDCT_norm(int *m1) { double d; int i,j; for (j = 0; j < 8; j++) { 
for (i = 0; i < 8; i++) { d = (double)m1[j*8+i]; if (i == 0 && j == 0) { d /= 8.0; } else if (i == 0 || j == 0) { d /= 4.0 * sqrt(2.0); } else { d /= 4.0; } m1[j*8+i] = (int)floor(2048.0 * 16.0 * d * cos(i * (3.1415926535 / 16.0)) * cos(j * (3.1415926535 / 16.0)) + 0.5); } } } /////////////////////////////////////////////////////////////////////////// void __declspec(naked) IDCT(void *dst, long modulo, int intra) { __asm { push esi push edi push ebp push ebx #ifdef PROFILE rdtsc mov profile_start, eax #endif // compute B1 (horizontal / vertical algoritm): // (the vertical part is in tensor product) // // <0> = [0] // <1> = [4] // <2> = [2] - [6] // <3> = [2] + [6] // <4> = [5] - [3] // <5> = [1] + [7] - ([5] + [3]) // <6> = [1] - [7] // <7> = [1] + [7] + ([5] + [3]) mov ebp,7*4 mov edi,0 idct_B1_loop: mov eax,[coeff+ebp*8+1*4] ;eax = [1] mov ebx,[coeff+ebp*8+7*4] ;ebx = [7] mov [coeff+ebp*8+1*4],edi mov [coeff+ebp*8+7*4],edi mov edx,[coeff+ebp*8+5*4] ;edx = [5] mov esi,[coeff+ebp*8+3*4] ;esi = [3] mov [coeff+ebp*8+5*4],edi mov [coeff+ebp*8+3*4],edi lea ecx,[eax+ebx] ;ecx = [1] + [7] sub eax,ebx ;eax = [1] - [7] lea ebx,[edx+esi] ;ebx = [5] + [3] sub edx,esi ;edx = [5] - [3] mov [matr1+ebp+4*32],edx mov esi,[coeff+ebp*8+2*4] ;esi = [2] lea edx,[ecx+ebx] ;edx = ([1] + [7]) + ([5] + [3]) mov [matr1+ebp+6*32],eax sub ecx,ebx ;ecx = ([1] + [7]) - ([5] + [3]) mov ebx,[coeff+ebp*8+6*4] ;ebx = [6] mov [coeff+ebp*8+2*4],edi mov [coeff+ebp*8+6*4],edi mov [matr1+ebp+7*32],edx mov edx,[coeff+ebp*8+0*4] ;edx = [0] lea eax,[esi+ebx] ;eax = [2] + [6] mov [matr1+ebp+5*32],ecx mov [matr1+ebp+3*32],eax sub esi,ebx ;esi = [2] - [6] mov ecx,[coeff+ebp*8+4*4] ;ecx = [4] mov [matr1+ebp+0*32],edx mov [matr1+ebp+2*32],esi mov [coeff+ebp*8+0*4],edi mov [coeff+ebp*8+4*4],edi mov [matr1+ebp+1*32],ecx sub ebp,4 jnc idct_B1_loop // compute the vertical part and the (tensor product) M: xor ebp,ebp call IDCT_vert_0137 mov ebp,1*8*4 call IDCT_vert_0137 mov ebp,3*8*4 call IDCT_vert_0137 mov 
ebp,7*8*4 call IDCT_vert_0137 call IDCT_vert_25 call IDCT_vert_46 // co35=matr1[5+32]-matr1[3+32]; // co17=matr1[1+48]-matr1[7+48]; // l2 = co35+co17; // l0 = co35-co17; // co35=matr1[5+48]-matr1[3+48]; // co17=matr1[1+32]-matr1[7+32]; // l1 = co35+co17; // l3 = co35-co17; mov ecx,matr1[(5+32)*4] mov esi,matr1[(1+48)*4] sub ecx,matr1[(3+32)*4] ;ecx = co35 sub esi,matr1[(7+48)*4] ;eax = co17 mov edi,matr1[(5+48)*4] mov ebx,matr1[(1+32)*4] sub edi,matr1[(3+48)*4] ;edi = co35 sub ebx,matr1[(7+32)*4] ;ebx = co17 mov eax,ecx mov edx,edi add ecx,esi ;ecx = l2 = co35+co17 add edi,ebx ;edi = l1 = co35+co17 sub eax,esi ;eax = l0 = co35-co17 sub edx,ebx ;edx = l3 = co35-co17 // g0 = C4*(l0+l1); // g1 = C4*(l0-l1); // g2 = l2<<12; // g3 = l3<<12; shl ecx,12 ;ecx = g2 = l2<<12 mov ebx,eax shl edx,12 ;edx = g3 = l3<<12 add eax,edi ;eax = l0+l1 (a) sub ebx,edi ;ebx = l0-l1 (b) nop #ifdef FAST_IMUL imul eax,2896 imul ebx,2896 #else shl ebx,4 ;b/1 lea esi,[eax*8+eax] ;a/1 lea edi,[ebx*8+ebx] ;b/2 lea esi,[esi*4+esi] ;a/2 lea edi,[edi*4+edi] ;b/3 lea eax,[esi*4+eax] ;a/3 shl eax,4 ;a/4 eax = g0 = C4*(l0+l1) lea ebx,[edi*4+ebx] ;b/4 ebx = g1 = C4*(l0-l1) #endif ; eax = g0 ; ebx = g1 ; ecx = g2 ; edx = g3 // matr2[38] = g1+g3; // matr2[52] = g1-g3; // matr2[36] = g2+g0; // matr2[54] = g2-g0; mov esi,ebx mov edi,ecx add ebx,edx ;ebx = g1+g3 add ecx,eax ;ecx = g2+g0 sub esi,edx ;esi = g1-g3 sub edi,eax ;edi = g2-g0 mov matr2[(6+32)*4],ebx mov matr2[(4+48)*4],esi mov matr2[(4+32)*4],ecx mov matr2[(6+48)*4],edi // tmp = C6*(matr2[32]+matr2[48]); // matr2[32] = -Q*matr2[32]-tmp; // matr2[48] = R*matr2[48]-tmp; // tmp = C6*(matr2[33] + matr2[49]); // matr2[33] = -Q*matr2[33]-tmp; // matr2[49] = R*matr2[49]-tmp; // tmp = C4C6 * (matr2[34] + matr2[50]); // matr2[34] = -C4Q*matr2[34]-tmp; // matr2[50] = C4R*matr2[50]-tmp; // tmp = C6*(matr2[35] + matr2[51]); // matr2[35] = -Q*matr2[35]-tmp; // matr2[51] = R*matr2[51]-tmp; // tmp = C4C6 * (matr2[37] + matr2[53]); // matr2[37] = 
-C4Q*matr2[37]-tmp; // matr2[53] = C4R*matr2[53]-tmp; // tmp = C6*(matr2[39] + matr2[55]); // matr2[39] = -Q*matr2[39]-tmp; // matr2[55] = R*matr2[55]-tmp; ;--- 0,1 mov eax,matr2[32*4] ;(1) mov ebx,matr2[33*4] ;(2) mov ecx,matr2[48*4] ;(1) mov edx,matr2[49*4] ;(2) add ecx,eax ;u (1) add edx,ebx ;v (2) mov eax,matr2[32*4] ;u (1) mov ebx,matr2[48*4] ;v (1) #ifdef FAST_IMUL imul ecx,1567+1 imul eax,2217 imul ebx,5352 #else lea ebp,[ecx+ecx*2] ;u (1c/1) ecx = C6*(matr2[32] + matr2[48]) nop ;v lea esi,[eax+eax] ;u (1a/1) Q*matr2[32] lea edi,[ebx*4+ebx] ;v (1b/1) R*matr2[48] shl edi,3 ;u (1b/2) nop ;v shl ebp,09H ;u (1c/2) lea esi,[esi*8+eax] ;v (1a/2) shl ecx,05H ;u (1c/3) lea ebx,[edi*4+ebx] ;v (1b/3) lea esi,[esi*4+eax] ;u (1a/3) add ecx,ebp ;v (1c/4) nop ;u lea ebx,[ebx*8+edi] ;v (1b/4) lea esi,[esi*4+eax] ;u (1a/4) nop ;v lea ebx,[ebx*4+edi] ;u (1b/5) nop ;v lea eax,[esi*8+eax] ;u (1a/5) #endif sub ebx,ecx ;v (1) R*matr2[48]-tmp add ecx,eax ;u (1) Q*matr2[32]+tmp xor ecx,-1 ;u (1) mov matr2[48*4],ebx ;v (1) inc ecx ;u (1) mov eax,matr2[33*4] ;v (2) mov matr2[32*4],ecx ;u (1) mov ebx,matr2[49*4] ;v (2) #ifdef FAST_IMUL imul edx,1567+1 imul eax,2217 imul ebx,5352 #else lea ebp,[edx+edx*2] ;u (2c/1) edx = C6*(matr2[33] + matr2[49]) nop ;v lea esi,[eax+eax] ;u (2a/1) Q*matr2[33] lea edi,[ebx*4+ebx] ;v (2b/1) R*matr2[49] shl edi,3 ;u (2b/2) nop ;v shl ebp,09H ;u (2c/2) lea esi,[esi*8+eax] ;v (2a/2) nop ;u lea ebx,[edi*4+ebx] ;v (2b/3) lea esi,[esi*4+eax] ;u (2a/3) nop ;v shl edx,05H ;u (2c/3) lea ebx,[ebx*8+edi] ;v (2b/4) lea esi,[esi*4+eax] ;u (2a/4) add edx,ebp ;v (2c/4) lea ebx,[ebx*4+edi] ;u (2b/5) nop ;v lea eax,[esi*8+eax] ;u (2a/5) #endif sub ebx,edx ;v (2) R*matr2[49]-tmp add edx,eax ;u (2) Q*matr2[33]+tmp xor edx,-1 ;u (2) mov matr2[49*4],ebx ;v (2) inc edx ;u (2) mov eax,matr2[35*4] ;v (1) [3,7] mov matr2[33*4],edx ;u (2) mov ebx,matr2[39*4] ;v (2) [3,7] ;--- 3, 7 mov ecx,matr2[51*4] ;(1) mov edx,matr2[55*4] ;(2) add ecx,eax ;(1) add edx,ebx ;(2) mov 
eax,matr2[35*4] ;(1) mov ebx,matr2[51*4] ;(2) #ifdef FAST_IMUL imul ecx,1567+1 imul eax,2217 imul ebx,5352 #else lea ebp,[ecx+ecx*2] ;u (1c/1) ecx = C6*(matr2[32] + matr2[48]) nop ;v lea esi,[eax+eax] ;u (1a/1) Q*matr2[32] lea edi,[ebx*4+ebx] ;v (1b/1) R*matr2[48] shl edi,3 ;u (1b/2) nop ;v shl ebp,09H ;u (1c/2) lea esi,[esi*8+eax] ;v (1a/2) nop ;u lea ebx,[edi*4+ebx] ;v (1b/3) lea esi,[esi*4+eax] ;u (1a/3) nop shl ecx,05H ;u (1c/3) lea ebx,[ebx*8+edi] ;v (1b/4) lea esi,[esi*4+eax] ;u (1a/4) add ecx,ebp ;v (1c/4) lea ebx,[ebx*4+edi] ;u (1b/5) nop ;v lea eax,[esi*8+eax] ;u (1a/5) #endif sub ebx,ecx ;u (1) R*matr2[48]-tmp add ecx,eax ;v (1) Q*matr2[32]+tmp xor ecx,-1 ;u (1) mov matr2[51*4],ebx ;v (1) inc ecx ;u (1) mov eax,matr2[39*4] ;v (2) mov matr2[35*4],ecx ;u (1) mov ebx,matr2[55*4] ;v (2) #ifdef FAST_IMUL imul edx,1567+1 imul eax,2217 imul ebx,5352 #else lea ebp,[edx+edx*2] ;u (2c/1) edx = C6*(matr2[33] + matr2[49]) nop ;v lea esi,[eax+eax] ;u (2a/1) Q*matr2[33] lea edi,[ebx*4+ebx] ;v (2b/1) R*matr2[49] shl edi,3 ;u (2b/2) nop ;v shl ebp,09H ;u (2c/2) lea esi,[esi*8+eax] ;v (2a/2) nop ;u lea ebx,[edi*4+ebx] ;v (2b/3) lea esi,[esi*4+eax] ;u (2a/3) nop ;v shl edx,05H ;u (2c/3) lea ebx,[ebx*8+edi] ;v (2b/4) lea esi,[esi*4+eax] ;u (2a/4) add edx,ebp ;v (2c/4) lea ebx,[ebx*4+edi] ;u (2b/5) nop ;v lea eax,[esi*8+eax] ;u (2a/5) #endif sub ebx,edx ;v (2) R*matr2[49]-tmp add edx,eax ;u (2) Q*matr2[33]+tmp xor edx,-1 ;u (2) mov matr2[55*4],ebx ;v (2) inc edx ;u (2) mov eax,matr2[34*4] ;v (1) [2,5] mov matr2[39*4],edx ;u (2) mov ebx,matr2[37*4] ;v (2) [2,5] ;--- 2,5 mov ecx,matr2[50*4] ;u (1) mov edx,matr2[53*4] ;v (2) add ecx,eax ;u (1) add edx,ebx ;v (2) mov eax,matr2[34*4] ;u (1) mov ebx,matr2[50*4] ;v (1) #ifdef FAST_IMUL imul ebx,7568 imul eax,3135 imul ecx,2217 #else lea edi,[ebx*4+ebx] ;u (1b/1) C4R*matr2[50] lea esi,[eax*2+eax] ;v (1a/1) C4Q*matr2[34] lea ebp,[ecx+ecx] ;u (1c/1) ecx = C4C6*(matr2[32] + matr2[48]) nop lea ebx,[ebx*8+edi] ;u (1b/2) lea eax,[esi*8] 
;v (1a/2) lea ebp,[ebp*8+ecx] ;u (1c/2) nop lea ebx,[ebx*8+ebx] ;u (1b/3) lea eax,[eax*8+esi] ;v (1a/3) lea ebp,[ebp*4+ecx] ;u (1c/3) nop lea ebx,[ebx*4+edi] ;u (1b/4) lea eax,[eax*4+esi] ;v (1a/4) lea ebp,[ebp*4+ecx] ;u (1c/4) nop shl ebx,4 ;u (1b/5) lea eax,[eax*4+esi] ;v (1a/5) lea ecx,[ebp*8+ecx] ;u (1c/5) #endif sub ebx,ecx ;u (1) C4R*matr2[50]-tmp add ecx,eax ;v (1) C4Q*matr2[34]+tmp xor ecx,-1 ;u (1) mov matr2[50*4],ebx ;v (1) mov eax,matr2[37*4] ;u (2) inc ecx ;v (1) mov ebx,matr2[53*4] ;u (2) mov matr2[34*4],ecx ;v (1) #ifdef FAST_IMUL imul ebx,7568 imul eax,3135 imul edx,2217 #else lea ebp,[edx+edx] ;v (2c/1) edx = C4C6*(matr2[33] + matr2[49]) nop ;u lea esi,[eax*2+eax] ;v (2a/1) C4Q*matr2[37] lea edi,[ebx*4+ebx] ;u (2b/1) C4R*matr2[53] lea ebp,[ebp*8+edx] ;u (2c/2) nop lea ebx,[ebx*8+edi] ;u (2b/2) lea eax,[esi*8] ;v (2a/2) lea ebp,[ebp*4+edx] ;u (2c/3) nop lea ebx,[ebx*8+ebx] ;u (2b/3) lea eax,[eax*8+esi] ;(2a/3) lea ebp,[ebp*4+edx] ;u (2c/4) nop lea ebx,[ebx*4+edi] ;u (2b/4) lea eax,[eax*4+esi] ;(2a/4) lea edx,[ebp*8+edx] ;u (2c/5) nop shl ebx,4 ;u (2b/5) lea eax,[eax*4+esi] ;(2a/5) #endif add eax,edx ;u (2) C4Q*matr2[37]+tmp sub ebx,edx ;v (2) C4R*matr2[53]-tmp xor eax,-1 ;u (2) mov matr2[53*4],ebx ;v (2) inc eax ;u (2) mov matr2[37*4],eax ;u (2) ;--- done // compute A1 x A2 x A3 (horizontal/vertical algoritm): mov ebp,7*4 idct_A1A2_loop: mov edi,[matr2+ebp*8+0*4] ;edi = [0] mov eax,[matr2+ebp*8+1*4] ;eax = [1] mov ebx,[matr2+ebp*8+3*4] ;ebx = [3] mov ecx,[matr2+ebp*8+2*4] ;ecx = [2] lea esi,[edi+eax] ;esi = [0]+[1] sub edi,eax ;edi = [0]-[1] mov edx,[matr2+ebp*8+7*4] ;edx = [7] sub ecx,ebx ;ecx = [2]-[3] lea eax,[esi+ebx] ;eax = [0]+[1]+[3] mov ebx,edi ;ebx = [0]-[1] add ebx,ecx ;ebx = [0]-[1]+[2]-[3] sub edi,ecx ;edi = [0]-[1]-[2]+[3] lea ecx,[eax+edx] ;ecx = [0]+[1]+[3]+[7] sub eax,edx ;eax = [0]+[1]+[3]-[7] sub edx,[matr2+ebp*8+6*4] ;edx = -[6]+[7] sub esi,[matr2+ebp*8+3*4] ;esi = [0]+[1]+[3] mov [matr1+ebp+0*32],ecx mov ecx,[matr2+ebp*8+5*4] ;ecx 
= [5] mov [matr1+ebp+7*32],eax lea eax,[ebx+edx] ;eax = [0]-[1]+[2]-[3]-[6]+[7] sub ebx,edx ;ebx = [0]-[1]+[2]-[3]+[6]-[7] mov [matr1+ebp+6*32],eax add edx,ecx ;edx = [5]-[6]+[7] mov ecx,[matr2+ebp*8+4*4] ;eax = [4] mov [matr1+ebp+1*32],ebx add ecx,edx ;ecx = [4]+[5]-[6]+[7] lea ebx,[edi+edx] ;ebx = [0]-[1]-[2]+[3]+[5]-[6]+[7] sub edi,edx ;edi = [0]-[1]-[2]+[3]-[5]+[6]-[7] lea eax,[esi+ecx] ;eax = [0]+[1]-[3]+[4]+[5]-[6]+[7] mov [matr1+ebp+2*32],ebx mov [matr1+ebp+5*32],edi sub esi,ecx ;esi = [0]+[1]-[3]-[4]-[5]+[6]-[7] mov [matr1+ebp+4*32],eax mov [matr1+ebp+3*32],esi sub ebp,4 jnc idct_A1A2_loop ;****************************************** mov ebp,-8*32 mov esi,[esp+4+16] mov eax,esi test dword ptr [esp+12+16],-1 jnz idct_final_loop_intra idct_final_loop_inter: mov eax,[matr1+ebp+8*32+0*4] ;eax = [0] mov ebx,[matr1+ebp+8*32+1*4] ;ebx = [1] mov edi,[matr1+ebp+8*32+3*4] ;edi = [3] add eax,ebx ;eax = [0] + [1] add eax,edi ;eax = [0] + [1] + [3] mov edx,[matr1+ebp+8*32+7*4] ;edx = [7] mov ebx,eax ;ebx = [0] + [1] + [3] add eax,edx ;eax = [0] + [1] + [3] + [7] sub ebx,edx ;ebx = [0] + [1] + [3] - [7] mov edx,eax xor ecx,ecx sar edx,22 mov cl,[esi+0] mov dl,[YUV_clip_table+edx+ecx+256] mov cl,[esi+7] mov [esi+0],dl mov edx,ebx sar edx,22 mov dl,[YUV_clip_table+edx+ecx+256] mov [esi+7],dl mov edi,[matr1+ebp+8*32+1*4] ;edi = [1] mov edx,[matr1+ebp+8*32+3*4] ;edx = [3] mov ecx,[matr1+ebp+8*32+2*4] ;ecx = [2] add edi,edx ;edi = [1] + [3] mov edx,[matr1+ebp+8*32+6*4] ;esi = [6] add edi,edi ;edi = 2[1] + 2[3] sub eax,edx ;eax = [0] + [1] + [3] - [6] + [7] sub edi,ecx ;edi = 2[1] - [2] + 2[3] add ebx,edx ;ebx = [0] + [1] + [3] + [6] - [7] sub eax,edi ;eax = [0] - [1] + [2] - [3] - [6] + [7] sub ebx,edi ;ebx = [0] - [1] + [2] - [3] + [6] - [7] mov ecx,eax xor edx,edx sar ecx,22 mov dl,[esi+6] mov cl,[YUV_clip_table+ecx+edx+256] mov dl,[esi+1] mov [esi+6],cl mov ecx,ebx sar ecx,22 mov cl,[YUV_clip_table+ecx+edx+256] mov [esi+1],cl mov edx,[matr1+ebp+8*32+2*4] sub 
edx,[matr1+ebp+8*32+3*4] ;edx = [2] - [3] mov ecx,[matr1+ebp+8*32+5*4] add edx,edx ;edx = 2[2] - 2[3] add eax,ecx ;eax = [0] - [1] + [2] - [3] + [5] - [6] + [7] sub ebx,ecx ;ebx = [0] - [1] + [2] - [3] - [5] + [6] - [7] sub eax,edx ;eax = [0] - [1] - [2] + [3] + [5] - [6] + [7] sub ebx,edx ;ebx = [0] - [1] - [2] + [3] - [5] + [6] - [7] mov edx,eax sar edx,22 xor ecx,ecx mov cl,[esi+2] mov dl,[YUV_clip_table+ecx+edx+256] mov cl,[esi+5] mov [esi+2],dl mov edx,ebx sar edx,22 mov dl,[YUV_clip_table+ecx+edx+256] mov [esi+5],dl mov ecx,[matr1+ebp+8*32+1*4] shl ecx,2 ;esi = 4[1] mov edx,[matr1+ebp+8*32+4*4] add eax,edx ;eax = [0] - [1] - [2] + [3] + [4] + [5] - [6] + [7] sub edi,ecx ;edi = -2[1] - [2] + 2[3] sub ebx,edx ;ebx = [0] - [1] - [2] + [3] - [4] - [5] + [6] - [7] sub eax,edi ;eax = [0] + [1] - [3] - [4] + [5] - [6] + [7] sar eax,22 sub ebx,edi ;ebx = [0] + [1] - [3] + [4] - [5] + [6] - [7] sar ebx,22 xor ecx,ecx xor edx,edx mov edi,[esp+8+16] mov cl,[esi+4] mov dl,[esi+3] mov al,[YUV_clip_table+eax+ecx+256] mov bl,[YUV_clip_table+ebx+edx+256] mov [esi+4],al mov [esi+3],bl add esi,edi add ebp,32 jnz idct_final_loop_inter finish: #ifdef PROFILE rdtsc mov ebx,eax sub eax,profile_start inc profile_slows add profile_slow_cycles,eax sub ebx,profile_last cmp ebx,CYCLES_PER_SECOND jb notsec call profile_update notsec: #endif pop ebx pop ebp pop edi pop esi ret align 16 idct_final_loop_intra: mov cl,[eax] mov eax,[matr1+ebp+8*32+4*0] ;eax = [0] mov ebx,[matr1+ebp+8*32+4*1] ;ebx = [1] mov esi,[matr1+ebp+8*32+4*3] ;esi = [3] mov edx,[matr1+ebp+8*32+4*2] ;edx = [2] lea ecx,[eax+ebx] ;ecx = [0]+[1] sub eax,ebx ;eax = [0]-[1] add ecx,esi ;ecx = [0]+[1]+[3] mov ebx,[matr1+ebp+8*32+4*7] ;ebx = [7] sub esi,edx ;esi = [3]-[2] mov edi,[matr1+ebp+8*32+4*6] ;edi = [6] sub eax,esi ;eax = [0]-[1]+[2]-[3] lea edx,[ecx+ebx] ;edx = [0]+[1]+[3]+[7] sub ecx,ebx ;ecx = [0]+[1]+[3]-[7] sub ebx,edi ;ebx = -[6]+[7] sar edx,22 ;edx = <0> mov edi,[esp+4+16] sar ecx,22 ;ecx = <7> mov 
dl,[YUV_clip_table+edx+256] ;dl = FINAL[0] mov dh,[YUV_clip_table+ecx+256] ;dh = FINAL[7] lea ecx,[eax+ebx] ;ecx = [0]-[1]+[2]-[3]-[6]+[7] lea esi,[esi*2+eax] ;esi = [0]-[1]-[2]+[3] mov [edi+0],dl mov [edi+7],dh sub eax,ebx ;eax = [0]-[1]+[2]-[3]+[6]-[7] sar eax,22 mov edx,[matr1+ebp+8*32+4*5] ;edi = [5] sar ecx,22 add ebx,edx ;ebx = [5]-[6]+[7] mov al,[YUV_clip_table+eax+256] ;al = FINAL[1] mov edx,[matr1+ebp+8*32+4*0] ;edx = [0] mov ah,[YUV_clip_table+ecx+256] ;ah = FINAL[6] lea ecx,[esi+ebx] ;ecx = [0]-[1]-[2]+[3]+[5]-[6]+[7] sub esi,ebx ;esi = [0]-[1]-[2]+[3]-[5]+[6]-[7] mov [edi+1],al mov [edi+6],ah mov eax,[matr1+ebp+8*32+4*1] ;eax = [1] sar ecx,22 add edx,eax ;edx = [0]+[1] sar esi,22 mov eax,[matr1+ebp+8*32+4*3] ;eax = [3] sub edx,eax ;edx = [0]+[1]-[3] mov eax,[matr1+ebp+8*32+4*4] ;eax = [4] mov cl,[YUV_clip_table+ecx+256] ;cl = FINAL[2] add ebx,eax ;ebx = [4]+[5]-[6]+[7] mov ch,[YUV_clip_table+esi+256] ;ch = FINAL[5] mov eax,[esp+8+16] add eax,edi lea esi,[edx+ebx] ;esi = [0]+[1]-[3]+[4]+[5]-[6]+[7] sar esi,22 sub edx,ebx ;edx = [0]+[1]-[3]-[4]-[5]+[6]-[7] sar edx,22 mov [edi+2],cl mov [edi+5],ch mov bl,[YUV_clip_table+esi+256] mov cl,[YUV_clip_table+edx+256] mov [edi+4],bl mov [edi+3],cl add ebp,32 mov [esp+4+16],eax jnz idct_final_loop_intra jmp short finish } IDCT_vert_0137: __asm { // tmp4 = matr1[3+p]-matr1[5+p]; // tmp6 = matr1[1+p]-matr1[7+p]; // tmp = C6 * (tmp6-tmp4); // matr2[p+4] = Q*tmp4-tmp; // matr2[p+6] = R*tmp6-tmp; mov eax,matr1[ebp+3*4] ;eax = matr1[3+p] mov ebx,matr1[ebp+1*4] ;ebx = matr1[1+p] sub eax,matr1[ebp+5*4] ;eax = tmp4 = matr1[3+p] - matr1[5+p] sub ebx,matr1[ebp+7*4] ;ebx = tmp6 = matr1[1+p] - matr1[7+p] #ifdef FAST_IMUL mov ecx,ebx sub ecx,eax ;ecx = C6*(tmp6-tmp4) imul ecx,1567+1 imul eax,2217 imul ebx,5352 #else lea esi,[eax+eax] ;(a/1) Q*tmp4 lea edi,[ebx*4+ebx] ;(b/1) R*tmp6 shl edi,3 ;(b/2) mov ecx,ebx lea esi,[esi*8+eax] ;(a/2) sub ecx,eax ;ecx = tmp6-tmp4 nop lea ebx,[edi*4+ebx] ;(b/3) lea esi,[esi*4+eax] ;(a/3) lea 
edx,[ecx+ecx*2] ;(c/1) eax = C6*(tmp6-tmp4) shl edx,09H ;(c/2) lea ebx,[ebx*8+edi] ;(b/4) nop ; lea esi,[esi*4+eax] ;(a/4) shl ecx,05H ;(c/3) lea ebx,[ebx*4+edi] ;(b/5) lea eax,[esi*8+eax] ;(a/5) add ecx,edx ;(c/4) #endif sub eax,ecx ;eax = Q*tmp4 - tmp; sub ebx,ecx ;ebx = R*tmp6 - tmp; mov matr2[ebp+4*4],eax ;matr2[p+4] = Q*tmp4 - tmp; mov matr2[ebp+6*4],ebx ;matr2[p+6] = R*tmp6 - tmp; // co17 = matr1[p+1] + matr1[p+7]; // co35 = matr1[p+3] + matr1[p+5]; // matr2[p+5] = (co17-co35)*C4; // matr2[p+7] = (co17+co35)<<11; // matr2[p+2] = (matr1[p+2]-matr1[p+6])*C4; // matr2[p+3] = (matr1[p+2]+matr1[p+6]) << 11; mov eax,matr1[ebp+1*4] ;eax = matr1[p+1] mov ebx,matr1[ebp+3*4] ;ebx = matr1[p+3] add eax,matr1[ebp+7*4] ;eax = co17 = matr1[p+1] + matr1[p+7] add ebx,matr1[ebp+5*4] ;ebx = co35 = matr1[p+3] + matr1[p+5] mov ecx,ebx ;ecx = co35 add ebx,eax ;ebx = co17 + co35 shl ebx,11 ;ebx = (co17 + co35)<<11 sub eax,ecx ;eax = co17 - co35 mov matr2[ebp+7*4],ebx ;matr2[p+7] = (co17 + co35)<<11 mov ebx,matr1[ebp+2*4] ;ebx = matr1[p+2] mov edx,matr1[ebp+6*4] ;edx = matr1[p+6] mov ecx,edx ;ecx = matr1[p+6] add edx,ebx ;edx = matr1[p+2] + matr1[p+6] shl edx,11 ;edx = (matr1[p+2] + matr1[p+6])<<11 sub ebx,ecx ;ebx = matr1[p+2] - matr1[p+6] mov matr2[ebp+3*4],edx ;matr2[p+3] = (matr1[p+2] + matr1[p+6])<<11 ;multiply eax, ebx by C4 #ifdef FAST_IMUL imul eax,2896 imul ebx,2896 #else shl ebx,4 lea esi,[eax*8+eax] lea esi,[esi*4+esi] lea edi,[ebx*8+ebx] lea eax,[esi*4+eax] lea edi,[edi*4+edi] shl eax,4 lea ebx,[edi*4+ebx] #endif // matr2[p+0] = matr1[p+0] << 11; // matr2[p+1] = matr1[p+4] << 11; mov ecx,matr1[ebp+0*4] mov edx,matr1[ebp+4*4] shl ecx,11 mov matr2[ebp+5*4],eax shl edx,11 mov matr2[ebp+2*4],ebx mov matr2[ebp+0*4],ecx mov matr2[ebp+1*4],edx ret }; IDCT_vert_25: __asm { mov ebp,2*8*4 call IDCT_vert_25_dorow mov ebp,5*8*4 IDCT_vert_25_dorow: // tmp4 = matr1[p+3]-matr1[p+5]; // tmp6 = matr1[p+1]-matr1[p+7]; mov eax, matr1[ebp+3*4] mov ebx, matr1[ebp+1*4] sub eax, matr1[ebp+5*4] 
;eax = tmp4 sub ebx, matr1[ebp+7*4] ;ebx = tmp6 // tmp = C4C6 * (tmp6-tmp4); c // matr2[p+4] = C4Q*tmp4-tmp; a // matr2[p+6] = C4R*tmp6-tmp; b #ifdef FAST_IMUL mov ecx,ebx sub ecx,eax ;ecx = tmp6 - tmp4 imul ecx,2217 ;ecx = C4C6*(tmp6 - tmp4) imul eax,3135 imul ebx,7568 #else mov ecx,ebx nop lea esi,[eax*2+eax] ;a/1 lea edi,[ebx*4+ebx] ;b/1 sub ecx,eax ;ecx = tmp6 - tmp4 nop lea eax,[esi*8] ;a/2 lea ebx,[ebx*8+edi] ;b/2 nop lea edx,[ecx+ecx] ;c/1 lea eax,[eax*8+esi] ;a/3 lea ebx,[ebx*8+ebx] ;b/3 nop lea edx,[edx*8+ecx] ;c/2 lea eax,[eax*4+esi] ;a/4 lea ebx,[ebx*4+edi] ;b/4 shl ebx,4 ;b/5 lea edx,[edx*4+ecx] ;c/3 lea eax,[eax*4+esi] ;a/5 lea edx,[edx*4+ecx] ;c/4 lea ecx,[edx*8+ecx] ;c/5 #endif sub eax,ecx sub ebx,ecx mov matr2[ebp+4*4],eax mov matr2[ebp+6*4],ebx // co17 = co1 + co7; // co35 = co3 + co5; // matr2[p+5] = (co17-co35)<<12; // matr2[p+7] = (co17+co35)*C4; (a) // co2=matr1[p+2]; // co6=matr1[p+6]; // matr2[p+2] = (co2-co6)<<12; // matr2[p+3] = (co2+co6)*C4; (b) mov eax,matr1[ebp+1*4] mov ebx,matr1[ebp+3*4] mov ecx,matr1[ebp+2*4] mov edx,matr1[ebp+6*4] add eax,matr1[ebp+7*4] ;eax = co17 add ebx,matr1[ebp+5*4] ;ebx = co35 mov esi,eax add eax,ebx ;eax = co17+co35 mov edi,ecx sub esi,ebx shl esi,12 add ecx,edx sub edi,edx shl edi,12 mov matr2[ebp+5*4],esi mov matr2[ebp+2*4],edi #ifdef FAST_IMUL imul ecx,2896 imul eax,2896 #else shl ecx,4 ;b/1 lea esi,[eax*8+eax] ;a/1 lea esi,[esi*4+esi] ;a/2 lea edi,[ecx*8+ecx] ;b/2 lea eax,[esi*4+eax] ;a/3 lea edi,[edi*4+edi] ;b/3 shl eax,4 ;a/4 lea ecx,[edi*4+ecx] ;b/4 #endif mov matr2[ebp+7*4],eax mov matr2[ebp+3*4],ecx // matr2[p+0] = C4*matr1[p ]; (a) // matr2[p+1] = C4*matr1[p+4]; (b) mov eax,matr1[ebp+0*4] mov ebx,matr1[ebp+4*4] #ifdef FAST_IMUL imul ebx,2896 imul eax,2896 #else shl ebx,4 ;b/1 lea esi,[eax*8+eax] ;a/1 lea esi,[esi*4+esi] ;a/2 lea edi,[ebx*8+ebx] ;b/2 lea eax,[esi*4+eax] ;a/3 lea edi,[edi*4+edi] ;b/3 shl eax,4 ;a/4 lea ebx,[edi*4+ebx] ;b/4 #endif mov matr2[ebp+0*4],eax mov matr2[ebp+1*4],ebx ret } 
IDCT_vert_46: __asm { mov ebp,4*8*4 call IDCT_vert_46_dorow mov ebp,6*8*4 IDCT_vert_46_dorow: mov eax,matr1[ebp+0*4] mov ebx,matr1[ebp+4*4] mov matr2[ebp+0*4],eax mov matr2[ebp+1*4],ebx mov esi,matr1[ebp+2*4] mov edi,matr1[ebp+6*4] mov eax,matr1[ebp+1*4] mov ebx,matr1[ebp+3*4] add eax,matr1[ebp+7*4] add ebx,matr1[ebp+5*4] lea edx,[esi+edi] sub esi,edi lea ecx,[eax+ebx] sub eax,ebx mov matr2[ebp+2*4],esi mov matr2[ebp+3*4],edx mov matr2[ebp+5*4],eax mov matr2[ebp+7*4],ecx ret }; }